This is an attempt at a streamlined and yet complete, relatively a priori/non- snooped model analysis. Note that this version is an overhaul, with some missing components from the older results: see MixedEffects_older.Rmd

Some parts of the analysis have been moved into separate .R files: the overall workflow is

source("utils.R")
source("gamm4_utils.R")

Packages used/versions:

load_all_pkgs()
theme_set(theme_bw())
zmargin <- theme(panel.spacing=grid::unit(0,"lines"))
zmarginx <- theme(panel.spacing.x=grid::unit(0,"lines"))
nolegend <- theme(legend.position="none")
library('pander')  ## for tables
sapply(pkgList,function(x) format(packageVersion(x)))
##          lme4         gamm4         bbmle   broom.mixed          brms 
## "1.1.21.9002"       "0.2.5"    "1.0.23.1"  "0.2.4.9000"      "2.11.1" 
##    data.table       lattice     gridExtra       ggplot2       viridis 
##      "1.12.8"     "0.20.38"         "2.3"       "3.2.1"       "0.5.1" 
##        plotly       cowplot      ggstance          plyr         dplyr 
##       "4.9.1"       "1.0.0"       "0.3.3"       "1.8.5"       "0.8.4" 
##         tidyr        tibble         remef        r2glmm        raster 
##       "1.0.2"       "2.1.3"  "1.0.6.9000"       "0.1.2"      "3.0.12" 
##         rgdal        fields       plotrix            sp 
##       "1.4.8"        "10.3"       "3.7.7"       "1.3.2"
comb_out <- function(p,fn,...) {
    print(p)
    htmlwidgets::saveWidget(ggplotly(p,...),fn)
}

Data from Enric (includes area and lat/long coordinates):

ecoreg <- readRDS("ecoreg.rds")

Limit all observations with all predictor variables > 0 and no biome 98/99; set biome as factor. Log and scale/center variables as appropriate. This leaves us with 26 variables and 620 observations.

Model fitting

This can now be done semi-automatically (i.e., fit all combinations of random effects) by using the function fit_all from the utils.R file (sourced above), e.g. fit_all(response="mbirds_log")

The fit_batch.R code has already run all random-effects model combinations for all four response variables, both with lme4 and with gamm4. sum_batch.R reduces these to lists with components sum (summaries: AIC, singularity, etc.); coef (coefficients); and pred (fitted, residuals, etc.).

Load data: summaries containing

lme4_res <- readRDS("allfits_sum_lme4.rds")  ## 4 taxa x 27 fits, using lmer
gamm4_res <- readRDS("allfits_sum_gamm4.rds") ## 4 taxa x 27 fits, using gamm4

Table of models with AIC<10:

aictab1 <- lme4_res$sum %>%
    filter(AIC<8) %>%
    arrange(taxon,AIC) %>%
    mutate(AIC=round(AIC,1),
           model=shorten_modelname(model))
pander(dplyr::select(aictab1,-best),emphasize.strong.rows=which(aictab1$best))
taxon model AIC singular df
mamph_log b=i/fr=i/b_FR=d 0 TRUE 20
mamph_log b=d/fr=i/b_FR=i 2 TRUE 20
mamph_log b=d/fr=d/b_FR=i 3.9 TRUE 24
mamph_log b=d/fr=i/b_FR=d 4 TRUE 24
mamph_log b=i/fr=i/b_FR=f 5 TRUE 30
mamph_log b=i/fr=d/b_FR=d 5.1 TRUE 24
mbirds_log b=i/fr=i/b_FR=d 0 FALSE 20
mbirds_log b=i/fr=d/b_FR=f 3 TRUE 34
mbirds_log b=i/fr=d/b_FR=d 4 TRUE 24
mbirds_log b=i/fr=i/b_FR=f 5.7 TRUE 30
mbirds_log b=d/fr=i/b_FR=d 7.5 TRUE 24
mmamm_log b=i/fr=d/b_FR=f 0 TRUE 34
mmamm_log b=i/fr=i/b_FR=f 4 TRUE 30
mmamm_log b=d/fr=d/b_FR=f 7.2 TRUE 38
plants_log b=i/fr=i/b_FR=d 0 TRUE 20
plants_log b=d/fr=i/b_FR=i 0.2 TRUE 20
plants_log b=i/fr=i/b_FR=f 1.6 TRUE 30
plants_log b=d/fr=i/b_FR=d 3.4 TRUE 24
plants_log b=d/fr=i/b_FR=f 5.1 TRUE 34
plants_log b=d/fr=d/b_FR=i 6.4 TRUE 24
plants_log b=i/fr=d/b_FR=d 6.5 TRUE 24

Best (non-singular) models only:

get_best <- . %>% .[["sum"]] %>% filter(best) %>% dplyr::select(-c(best,singular))
all_best_sum <- purrr::map_dfr(list(lme4=lme4_res,gamm4=gamm4_res), get_best, .id="type")
pander(all_best_sum)
type taxon model AIC df
lme4 mbirds_log biome=int/flor_realms=int/biome_FR=diag 0 20
lme4 mmamm_log biome=int/flor_realms=diag/biome_FR=diag 10.3 24
lme4 mamph_log biome=int/flor_realms=int/biome_FR=int 17.25 16
gamm4 mbirds_log biome=int/flor_realms=int/biome_FR=diag 0 21
gamm4 plants_log biome=int/flor_realms=int/biome_FR=diag 1.81 21
gamm4 mamph_log biome=int/flor_realms=int/biome_FR=diag 7.614 21
gamm4 mmamm_log biome=int/flor_realms=diag/biome_FR=int 20.16 21

Our current strategy is to (1) take the best non-singular model fitted by lme4; (2) find the coefficients of the corresponding gamm4 model. (Below, we also try taking the best non-singular gamm4 model.)

get_best_tm <- . %>% get_best() %>% dplyr::select(taxon, model)
lme4_best_models <- lme4_res %>% get_best_tm()
gamm4_best_models <- gamm4_res %>% get_best_tm()
## keep only predictions from best models
gamm4_best_pred <- gamm4_res$pred %>%
    right_join(lme4_best_models,by=c("taxon","model"))

Fitted vs residual for all four taxa:

ggplot(gamm4_best_pred,aes(.fitted,.resid))+
    geom_point()+geom_smooth()+
    facet_wrap(~taxon,nrow=1)+zmargin

A little bit heavy-tailed …

## https://stackoverflow.com/questions/40598011/how-to-customize-hover-information-in-ggplotly-object
ggqq <- ggplot(gamm4_best_pred,aes(sample=.resid,
                                   colour=biome,
                                   flor_realms=flor_realms))+
    stat_qq()+stat_qq_line(aes(group=taxon))+
    facet_wrap(~taxon,nrow=1)+zmargin
comb_out(ggqq,"ggqq.html",tooltip=c("biome","flor_realms"))

(dynamic alternative)

coefficient plot of all models tried

For example (was using plants, now birds because we might not be fitting the models for plants):

ex <- "mbirds_log"
gamm4_allcoef <- get_allcoefs(gamm4_res,focal_taxon=ex) %>% add_wald_ci()
lme4_allcoef <- get_allcoefs(lme4_res,focal_taxon=ex) %>% add_wald_ci()
gg_allcoef <- ggplot(gamm4_allcoef,aes(estimate,model,colour=singular,shape=singular))+
    ## use point + linerange because some RE are missing std.errors
    geom_point()+
    geom_linerangeh(aes(xmin=lwr,xmax=upr))+
    facet_wrap(~term,scale="free_x")+
    geom_vline(xintercept=0,lty=2)+
    scale_colour_brewer(palette="Set1")+
    zmargin

All gamm4 coefs for mbirds_log:

print(gg_allcoef)

all lme4 coefs for mbirds_log

print(gg_allcoef %+% lme4_allcoef)

to do: why so many singular now … ?

to do: redo profiling/profile plots, comparison of Wald/profile CIs (at least for best models …

Check difference between Wald and likelihood profile confidence intervals. In principle profile CIs are more accurate - if the computations have run reliably … but I would probably be conservative and take the wider of the two.

Pull out coefs from best models for lme4, gamm4 (from lme4-best model), gamm4 (from gamm4-best model)

## lmer-best fits fitted via gamm4
gamm4_best_coef <- gamm4_res$coef %>%
    right_join(lme4_best_models,by=c("taxon","model")) %>%
    add_wald_ci %>% drop_intercept
## gamm4-best fits
gamm4_best2_coef <- gamm4_res$coef %>%
    right_join(gamm4_best_models,by=c("taxon","model")) %>%
    add_wald_ci %>% drop_intercept
## lmer-best fits
lme4_best_coef <- lme4_res$coef %>%
    right_join(lme4_best_models,by=c("taxon","model")) %>%
    add_wald_ci %>% drop_intercept
all_best_coef <- bind_rows(list(lme4=lme4_best_coef,gamm4=gamm4_best_coef,
                                gamm4_2=gamm4_best2_coef),
                           .id="type")

to do: plot all coefficients including std devs (need to combine group with term for random effects)

Exclude random effects and NPP_log_sc (which is large and significant for all taxa). Abuse warning: coloring significant effects. Snooping warning: counting number of sig values for each model type!

There are not a lot of effects other than NPP_log_sc that are consistently large and significant; perhaps main effects of fire for birds and mammals. Amphibians have considerably larger effects (the last three: Feat cv and its interactions).

Enric and I decided that it was probably best to go with the gamm4-best models, with some caution expressed about the effects that were marginal. Here are the final(ish?) results, with NPP effects added back in:

Plotting functions

Load best non-singular gamm4 fits:

best_models <- readRDS("bestmodels_gamm4.rds")
names(best_models)
## [1] "mamph_log"  "mbirds_log" "mmamm_log"  "plants_log"

plotfun() takes arguments:

bm <- best_models[[ex]]
p1A <- plotfun(bm)+nolegend
p1B <- plotfun(bm,backtrans=TRUE)+nolegend
p1C <- plotfun(bm,backtrans=TRUE,log="xy")+nolegend
cowplot::plot_grid(p1A,p1B,p1C,labels="auto")

This plot shows (a) observed points and fit on log scale (using “fire eaten” as aux variable, i.e. lines show predictions for 0.1/0.5/0.9 quantiles of fire-eaten); (b) points and back-transformed predictions; (c) the same, but with a constrained scale; (d) back-transformed preds with scaled axes (scale_*_log10()) (scale also constrained)

Partial residuals:

p1D <- plotfun(bm,respvar="partial_res")+nolegend
p1E <- plotfun(bm,respvar="partial_res",backtrans=TRUE)+nolegend
p1F <- plotfun(bm,respvar="partial_res",backtrans=TRUE,log="xy")+nolegend
partial_res_plot <- cowplot::plot_grid(p1D,p1E,p1F,labels="auto")
print(partial_res_plot)

\(R^2\) across taxa

Only a few effects have partial \(R^2\) values of more than a few percent: NPP (of course), fire (for mammals and ?birds?), and fire CV (for amphibians). Everything else is going to be pretty subtle (provided of course that we trust this particular way of estimating \(R^2\)).

all_rsq <- bind_rows(lapply(best_models,
                            function(x) r2beta(x$mer)),.id="taxon") %>%
    mutate(Effect=as.character(Effect),
           Effect=gsub("^X","",Effect),
           Effect=reorder(factor(Effect),Rsq))           
rsqplot <- ggplot(all_rsq,aes(Rsq,Effect,colour=taxon,shape=taxon))+
    geom_pointrangeh(aes(xmin=lower.CL,xmax=upper.CL),
                   position=position_dodgev(height=0.5))+
    scale_colour_brewer(palette="Dark2")+
    ## scale_x_log10(limits=c(1e-2,1),oob=scales::squish)+
    labs(y="")
print(rsqplot)

Plot in two panels:

all_rsq$upper <- with(all_rsq,Effect %in% c("Model","NPP_log_sc","log(area_km2)"))
r1 <- rsqplot %+% subset(all_rsq,upper)
r2 <- rsqplot %+% subset(all_rsq,!upper)
## facet_wrap doesn't have 'space' argument ...
## rsqplot %+% all_rsq +facet_wrap(~upper,ncol=1,scales="free")
## facet_grid doesn't use axes for all facets ...
## rsqplot %+% all_rsq +facet_grid(upper~.,scales="free",space="free") +
##    theme(strip.text=element_blank())
## https://cran.r-project.org/web/packages/cowplot/vignettes/shared_legends.html
L <- get_legend(r1)
## stack facets
p1 <- plot_grid(r1
                +labs(x="")
                ## rectangle indicating scale of lower box w/in upper
                + geom_rect(xmin=0,xmax=0.125,ymin=-Inf,ymax=Inf,
                            fill="black",
                            colour=NA,alpha=0.02)
                + theme(legend.position="none"),
                r2+theme(legend.position="none"),
                ncol=1,
                align="v",rel_heights=c(0.3,0.7),axis="b")
## add legend
plot_grid(p1,L,rel_widths=c(2,0.4))

Models fitted with only biome and flor_realms

(from fit_batch.R): diagonal (independent) random effects at the biome and flor realm level. One of the keys here is that the line for each biome is estimated based on the median values of the other predictors (fire, NPP CV, area, etc.) for that particular biome, not the global median …

allfits_restr_gamm4 <- readRDS("allfits_restr_gamm4.rds")
predList <- lapply(allfits_restr_gamm4,
       predfun,
       auxvar=NULL,grpvar="biome",
       re.form=~(1+NPP_log_sc|biome))
set.focal <- function(n,d) { d$focal <- d[[n]]; return(d) }
predList <- Map(set.focal,names(predList),predList)
predFrame <- bind_rows(predList,.id="taxon")
dList <- setNames(replicate(4,ecoreg,simplify=FALSE),names(predList))
dList <- Map(set.focal,names(dList),dList)
dFrame <- bind_rows(dList,.id="taxon")
ggplot(predFrame,aes(x=NPP_log_sc,y=focal,colour=biome))+
    geom_line(lwd=1.5)+
    geom_point(data=dFrame,aes(shape=flor_realms),alpha=0.7)+
    facet_wrap(~taxon)+zmargin+
    labs(y="log diversity")+
    theme(legend.position="bottom")

Version with Feat_log_sc (main effect of fire) as focal predictor:

predList_fire <- lapply(allfits_restr_gamm4,
       predfun,
       xvar="Feat_log_sc",
       auxvar=NULL,grpvar="biome",
       re.form=~(1+Feat_log_sc|biome))
predList_fire <- Map(set.focal,names(predList_fire),predList_fire)
predFrame_fire <- bind_rows(predList_fire,.id="taxon")
(gpbf1 <- ggplot(predFrame_fire,aes(x=Feat_log_sc,y=focal,colour=biome))+
    geom_line(lwd=1.5)+
    geom_point(data=dFrame,aes(shape=flor_realms),alpha=0.7)+
    facet_wrap(~taxon)+zmargin+
    labs(y="log diversity")+
    theme(legend.position="bottom"))

gpbf1 + geom_ribbon(colour=NA,aes(ymin=lwr,ymax=upr,group=biome),alpha=0.2)

Plot random effects estimates with \(\pm\) 2 SE (these effects do not include the effects of the variation of levels of NPP and fire across biomes/realms; they represent deviations from the expectation based on the population-level effects …)

do.call(plot_grid,ggL)

ff <- filter(tt_all, taxon=="mbirds_log", group=="biome",term=="NPP_log_sc")
gg2A %+% ff

Extract coefficients

In this particular case the effect of NPP only varies across floristic realms, so the picture isn’t especially pretty:

coefs_plants <- merge_coefs(ecoreg,best_models[[1]])
ggplot(coefs_plants,aes(x,y,colour=NPP_log_sc))+
    geom_point()+
    scale_colour_viridis()